{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "8cd3f128-866a-4857-a00a-df19f926c952",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Evaporate Demo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c9e4ffe4-f0eb-4850-8820-48e14ffcbe96",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import requests\n",
    "from llama_index import SimpleDirectoryReader, ServiceContext, LLMPredictor\n",
    "from llama_index.experimental.evaporate import EvaporateExtractor\n",
    "from llama_index.llms import OpenAI\n",
    "from llama_index.program.predefined.evaporate import EvaporateProgram"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "a299cad8-af81-4974-a3de-ed43877d3490",
   "metadata": {},
   "source": [
    "### Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "daf434f6-3b27-4805-9de8-8fc92d7d776b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Wikipedia article titles, one per city, fetched and parsed below\n",
    "wiki_titles = [\n",
    "    \"Toronto\",\n",
    "    \"Seattle\",\n",
    "    \"Chicago\",\n",
    "    \"Boston\",\n",
    "    \"Houston\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8438168c-3b1b-425e-98b0-2c67a8a58a5f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Download each article's plain-text extract to data/{title}.txt\n",
    "data_path = Path(\"data\")\n",
    "data_path.mkdir(exist_ok=True)\n",
    "\n",
    "for title in wiki_titles:\n",
    "    response = requests.get(\n",
    "        \"https://en.wikipedia.org/w/api.php\",\n",
    "        params={\n",
    "            \"action\": \"query\",\n",
    "            \"format\": \"json\",\n",
    "            \"titles\": title,\n",
    "            \"prop\": \"extracts\",\n",
    "            \"explaintext\": True,\n",
    "        },\n",
    "        timeout=30,\n",
    "    )\n",
    "    # Surface HTTP failures here rather than as a KeyError on the JSON below\n",
    "    response.raise_for_status()\n",
    "    page = next(iter(response.json()[\"query\"][\"pages\"].values()))\n",
    "    wiki_text = page[\"extract\"]\n",
    "\n",
    "    # Write as UTF-8 explicitly: Wikipedia text may contain characters the\n",
    "    # platform default encoding (e.g. cp1252 on Windows) cannot represent\n",
    "    (data_path / f\"{title}.txt\").write_text(wiki_text, encoding=\"utf-8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c01dbcb8-5ea1-4e76-b5de-ea5ebe4f0392",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Read every downloaded article back in, keyed by its wiki title\n",
    "city_docs = {\n",
    "    title: SimpleDirectoryReader(input_files=[f\"data/{title}.txt\"]).load_data()\n",
    "    for title in wiki_titles\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b8e98279-b4c4-41ec-b696-13e6a6f841a4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# LLM and service context handed to the node parser and extractor below\n",
    "llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0)\n",
    "service_context = ServiceContext.from_defaults(chunk_size=512, llm=llm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "74c6c1c3-b797-45c8-b692-7a6e4bd1898d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Split each city's documents into nodes using the service context's parser\n",
    "city_nodes = {\n",
    "    title: service_context.node_parser.get_nodes_from_documents(\n",
    "        city_docs[title]\n",
    "    )\n",
    "    for title in wiki_titles\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "732084ea-3270-4bac-a8d5-f5631fa586ad",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# First node (the intro paragraph) for Toronto and Seattle only\n",
    "city_pop_nodes = [city_nodes[title][0] for title in (\"Toronto\", \"Seattle\")]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "bb369a78-e634-43f4-805e-52f6ea0f3588",
   "metadata": {},
   "source": [
    "### Evaporate Extractor Demo\n",
    "\n",
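    "Here we demonstrate each step of the `EvaporateExtractor` individually: identifying fields, generating an extraction function for each field, and running those functions on the nodes."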
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "5988a3c9-ad47-4463-a57d-e069aad60687",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "extractor = EvaporateExtractor(service_context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6c260836",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrameRow(row_values=[\"[['2,794', '6,712']]\"])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Use Boston's first node as training data and run inference on Toronto's first node\n",
    "program = EvaporateProgram.from_defaults(fields_to_extract=[\"population\"])\n",
    "program(\n",
    "    training_data=city_nodes[\"Boston\"][:1],\n",
    "    infer_data=city_nodes[\"Toronto\"][:1],\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "35173e7a-8e89-4897-a59b-3e31a7ef61e1",
   "metadata": {},
   "source": [
    "#### Extract Fields\n",
    "\n",
    "We demonstrate how to identify common fields across different chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a5414686-1f34-471d-9eab-dcfc1280d97d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Try with just Boston\n",
    "boston_fields = extractor.identify_fields(city_nodes[\"Boston\"][:1], topic=\"Boston\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4882b5b8-618d-4f52-ab53-f413a7bd52b5",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['colleges and universities', 'area', 'population', 'firsts', 'state']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boston_fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1df3a7df-6d00-4487-b114-f45a6dba4764",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Try with Toronto and Seattle (should extract \"population\")\n",
    "existing_fields = extractor.identify_fields(\n",
    "    city_pop_nodes, topic=\"city\", fields_top_k=1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8a56bb6-3a26-40db-9ca3-8aa9ed4f2c52",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['population']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "existing_fields"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "73bf1c9d-1946-4e6d-992f-b71d2c8ed562",
   "metadata": {},
   "source": [
    "#### Extract Functions from Fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2433c23a-c3c4-4dc1-901a-cac1e048e6ea",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def get_fn_str_dict(nodes: list, existing_fields: list) -> dict:\n",
    "    \"\"\"Generate one extraction-function source string per field.\n",
    "\n",
    "    Args:\n",
    "        nodes: Nodes passed to `extractor.extract_fn_from_nodes` for each field.\n",
    "        existing_fields: Field names to generate functions for, e.g. the list\n",
    "            returned by `extractor.identify_fields`.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping each field name to its generated function string. Each\n",
    "        field name and function string is also printed as it is produced.\n",
    "    \"\"\"\n",
    "    fn_str_dict = {}\n",
    "    for field in existing_fields:\n",
    "        fn_str = extractor.extract_fn_from_nodes(nodes, field)\n",
    "        print(f\"Field: {field}\")\n",
    "        print(fn_str)\n",
    "        fn_str_dict[field] = fn_str\n",
    "    return fn_str_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7758b1bb-f26d-4b39-85e2-095829053380",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 814 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Field: colleges and universities\n",
      "def get_colleges_and_universities_field(text: str):\n",
      "    \"\"\"\n",
      "    Function to extract colleges and universities. \n",
      "    \"\"\"\n",
      "    \n",
      "    # Use regex to find the colleges and universities field\n",
      "    pattern = r\"colleges and universities, notably (.*?),\"\n",
      "    colleges_and_universities_field = re.findall(pattern, text)\n",
      "    \n",
      "    # Return the result as a list\n",
      "    return colleges_and_universities_field[0].split(\" and \")\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 591 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Field: area\n",
      "def get_area_field(text: str):\n",
      "    \"\"\"\n",
      "    Function to extract area. \n",
      "    \"\"\"\n",
      "    \n",
      "    # Use regex to find the area field\n",
      "    area_field = re.findall(r'area of about (.*?) sq mi', text)\n",
      "    \n",
      "    # Return the result as a list\n",
      "    return area_field\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 589 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Field: population\n",
      "def get_population_field(text: str):\n",
      "    \"\"\"\n",
      "    Function to extract population. \n",
      "    \"\"\"\n",
      "    \n",
      "    # Use regex to find the population field\n",
      "    population_field = re.findall(r'population of (.*?) as', text)\n",
      "    \n",
      "    # Return the result as a list\n",
      "    return population_field\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 684 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Field: firsts\n",
      "def get_firsts_field(text: str):\n",
      "    \"\"\"\n",
      "    Function to extract firsts. \n",
      "    \"\"\"\n",
      "    \n",
      "    # Use regex to find the \"firsts\" field\n",
      "    pattern = r\"firsts\\s*include\\s*(.*?)\\.\"\n",
      "    firsts_field = re.findall(pattern, text, re.DOTALL)\n",
      "    \n",
      "    # Split the field into a list\n",
      "    firsts_list = firsts_field[0].split(',')\n",
      "    \n",
      "    # Strip whitespace from each item in the list\n",
      "    firsts_list = [item.strip() for item in firsts_list]\n",
      "    \n",
      "    return firsts_list\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 665 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Field: state\n",
      "def get_state_field(text: str):\n",
      "    \"\"\"\n",
      "    Function to extract state. \n",
      "    \"\"\"\n",
      "    \n",
      "    # Use regex to find the state field\n",
      "    pattern = r\"\\b(US:)\\b\"\n",
      "    matches = re.findall(pattern, text)\n",
      "    \n",
      "    # Return the result as a list\n",
      "    return list(matches)\n"
     ]
    }
   ],
   "source": [
    "boston_fn_str_dict = get_fn_str_dict(city_nodes[\"Boston\"][:1], boston_fields)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f3c15344-77ea-4641-ae7a-50b7b239fd75",
   "metadata": {},
   "source": [
    "#### Run Function for each Field"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1de0759f-6adc-4ae9-a1b7-b2d660b4350b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def get_result_dict(nodes: list, fn_str_dict: dict) -> dict:\n",
    "    \"\"\"Run each field's generated function over `nodes`.\n",
    "\n",
    "    Returns a dict mapping every key of `fn_str_dict` to whatever\n",
    "    `extractor.run_fn_on_nodes` returns for that field.\n",
    "    \"\"\"\n",
    "    return {\n",
    "        field: extractor.run_fn_on_nodes(nodes, fn_str, field)\n",
    "        for field, fn_str in fn_str_dict.items()\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49f49fb6-2680-4a74-aa69-404dbebfc9df",
   "metadata": {},
   "outputs": [],
   "source": [
    "boston_result_dict = get_result_dict(city_nodes[\"Boston\"][:1], boston_fn_str_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1607f6be-f00d-47be-a56b-5c781b6bf626",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'colleges and universities': [['Harvard', 'MIT']],\n",
       " 'area': [['48.4']],\n",
       " 'population': [['675,647']],\n",
       " 'firsts': [[\"the United States' first public park (Boston Common\",\n",
       "   '1634)',\n",
       "   'first public or state school (Boston Latin School',\n",
       "   '1635) first subway system (Tremont Street subway',\n",
       "   '1897)',\n",
       "   'and first large public library (Boston Public Library',\n",
       "   '1848)']],\n",
       " 'state': [[]]}"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boston_result_dict"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "800c3a9b-5661-4653-b4d8-4db0a54b45fb",
   "metadata": {},
   "source": [
    "### Try Running E2E"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cee8b53-5c7c-4692-bd35-d1d8251ad1ed",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 631 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 597 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 695 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 591 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 651 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'colleges and universities': ['Harvard and MIT'],\n",
       "  'state': [],\n",
       "  'key events': ['the Boston Massacre',\n",
       "   'the Boston Tea Party',\n",
       "   'the Battle of Bunker Hill',\n",
       "   'and the siege of Boston.'],\n",
       "  'area': ['48.4'],\n",
       "  'firsts': [\"the United States' first public park (Boston Common\",\n",
       "   ' 1634)',\n",
       "   ' first public or state school (Boston Latin School',\n",
       "   ' 1635) first subway system (Tremont Street subway',\n",
       "   ' 1897)',\n",
       "   ' and first large public library (Boston Public Library',\n",
       "   ' 1848)']}]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "extractor.extract_datapoints_with_fn(city_nodes[\"Boston\"][:1], topic=\"Boston\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
